1. R libraries and software

library(tidyverse)
library(colorout)
library(here)
library(ggplot2)
library(scales)
library(reticulate)
library(qvalue)
library(flextable)
library(officer)
library(ggvenn)
library(extrafont)
library(forcats)
library(ggrepel)
library(ggtext)
library(R.SamBada)
library(rgdal)
library(stats)
library(geosphere)
library(pcadapt)

2. pcadapt with all 3 populations (MAN, NEW and AUT)

pcadapt performs principal component analysis and computes p-values to test for outliers. The test for outliers is based on the correlations between genetic variation and the first K principal components. pcadapt also handles Pool-seq data for which the statistical analysis is performed on the genetic markers frequencies. Returns an object of class pcadapt.

# read the file
x <- read.pcadapt(here("output", "outflank","autogenous.bed"), type = "bed")
# Run pcadapt with a large number of PCs
pcadapt_res <- pcadapt(x, K = 30)

# Create a scree plot
plot(pcadapt_res, option = "screeplot")

Choose PCs

# perform the analysis
pcadapt_res <- pcadapt(
  x, 
  method = c("mahalanobis"),
  min.maf = 0.1,
  LD.clumping = NULL,
  tol = 1e-04,
  K = 2) # choose the right K for your data

str(pcadapt_res)
## List of 11
##  $ scores         : num [1:60, 1:2] 0.0907 0.1033 0.1002 0.102 0.063 ...
##  $ singular.values: num [1:2] 0.381 0.271
##  $ loadings       : num [1:41620, 1:2] -0.00725 NA 0.00481 0.00327 0.00398 ...
##  $ zscores        : num [1:41620, 1:2] -4.84 NA 2.6 1.95 2.12 ...
##  $ af             : num [1:41620] 0.879 0.973 0.723 0.858 0.617 ...
##  $ maf            : num [1:41620] 0.1207 0.0273 0.2768 0.1417 0.3833 ...
##  $ chi2.stat      : num [1:41620] 4.214 NA 1.094 0.981 0.467 ...
##  $ stat           : num [1:41620] 5.699 NA 1.479 1.327 0.632 ...
##  $ gif            : num 1.35
##  $ pvalues        : num [1:41620] 0.122 NA 0.579 0.612 0.792 ...
##  $ pass           : int [1:31945] 1 3 4 5 6 7 8 9 10 12 ...
##  - attr(*, "K")= num 2
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.1
##  - attr(*, "class")= chr "pcadapt"
#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "output", "outflank", "autogenous.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-581444870     0    97856 C       T      
## 2 1        AX-583035083     0   305518 A       G      
## 3 1        AX-583035102     0   308124 A       G      
## 4 1        AX-583033342     0   315059 C       G      
## 5 1        AX-583035163     0   315386 A       G      
## 6 1        AX-583035198     0   330908 G       T
# Combine into a data frame
pcadapt_res_df <- data.frame(
  SNP = snps$SNP,
  Chromosome = snps$Scaffold,
  Position = snps$Position,
  pvalues = pcadapt_res$pvalues,
  stat = pcadapt_res$stat
) |> drop_na()
head(pcadapt_res_df)
##            SNP Chromosome Position   pvalues      stat
## 1 AX-581444870          1    97856 0.1215846 5.6988143
## 2 AX-583035102          1   308124 0.5787388 1.4791127
## 3 AX-583033342          1   315059 0.6122633 1.3268178
## 4 AX-583035163          1   315386 0.7916782 0.6317767
## 5 AX-583035198          1   330908 0.6550057 1.1443128
## 6 AX-583035257          1   442875 0.2441440 3.8133644

Adjust

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Get the SNP IDs of the outliers
outlier_snps <- pcadapt_res_df$SNP[outliers_indices]

# Print the SNP IDs
length(outlier_snps)
## [1] 42
# Save it
write.table(
  outlier_snps,
  file = here("output", "pcadapt","SNPs_pcadapt.txt"),
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  sep = "\n"
)
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6")  # Add or remove colors as needed.
names(color_vector) <- unique(pcadapt_res_df$Chromosome)  # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Create a new 'highlight' column, initially set to FALSE
pcadapt_res_df$highlight <- FALSE

# Update the 'highlight' column to highlight these outlier SNPs
pcadapt_res_df$highlight[outliers_indices] <- TRUE

# Identify the SNP with the smallest p-value in each chromosome among the associated SNPs
min_pval_snps <- pcadapt_res_df |>
  filter(highlight) |>
  group_by(Chromosome) |>
  slice(which.min(pvalues))

# Custom label function
k_format <- function(x) {
  paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}

# source the plotting function
source(
  here(
    "scripts", "analysis", "my_theme2.R"
  )
)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(aes(color = Chromosome),
             data = subset(pcadapt_res_df, highlight == FALSE),
             size = .5) +
  geom_point(
    color = "magenta",
    data = subset(pcadapt_res_df, highlight == TRUE),
    size = .5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(title = "pcadapt Brazil", x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after Benjamini-Hochberg adjustment (p<0.05).") +
  facet_wrap(~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(panel.spacing = unit(.2, "lines"),
        plot.margin = unit(c(1, 1, 2, 2), "lines"),
        plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "pcadapt.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)
plot(pcadapt_res, option = "qqplot")

plot(pcadapt_res, option = "scores")

hist(pcadapt_res$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "pink")

plot(pcadapt_res, option = "stat.distribution")

Choosing a cutoff for outlier detection

q-values

qval <- qvalue(pcadapt_res$pvalues)$qvalues
alpha <- 0.05
outliers <- which(qval < alpha)
length(outliers)
## [1] 43

Benjamini-Hochberg Procedure

padj <- p.adjust(pcadapt_res$pvalues,method="BH")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 42

Bonferroni correction

padj <- p.adjust(pcadapt_res$pvalues,method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 13

Loadings

par(mfrow = c(2, 2))
for (i in 1:2)
  plot(pcadapt_res$loadings[, i], pch = 19, cex = .3, ylab = paste0("Loadings PC", i))

2.1 Venn diagram outFlank vs pcadapt

# Read data from txt files
outflank_snps <-
  read.table(
    here("output", "outflank", "SNPs_outFlank.txt"),
    stringsAsFactors = FALSE
  ) |>
  drop_na()
pcadapt_snps <-
  read.table(
    here("output", "pcadapt", "SNPs_pcadapt.txt"),
    stringsAsFactors = FALSE
  )

# Create a list with all dataframes
list_of_clusters <- list("outFlank" = outflank_snps$V1, "pcadapt" = pcadapt_snps$V1)


# Create Venn diagram
venn_diagram <- ggvenn(list_of_clusters, fill_color = c("steelblue", "darkorange"))
print(venn_diagram)

# Find common SNPs
common_SNPs <- Reduce(intersect, list_of_clusters)

# Save the shared SNP ids into a txt file
write.table(
  common_SNPs,
  file = here("output", "pcadapt", "common_SNPs_pcadapt_outflank.txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "significant_snps.pdf")
ggsave(output_path, venn_diagram, height = 5, width = 5, dpi = 300)

With python Clean env

py_run_string("import gc; gc.collect()")

Venn

import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import gc

# File paths
path_outflank = 'output/outflank/SNPs_outFlank.txt'
path_pcadapt = 'output/pcadapt/SNPs_pcadapt.txt'
pdf_output_path = 'output/pcadapt/venn_3_pop_pcadapt_outflank.pdf'  # PDF output file path

# Read the data (assuming the SNP IDs are in the first column)
outflank_snps = pd.read_csv(path_outflank, sep="\t", header=None)
pcadapt_snps = pd.read_csv(path_pcadapt, sep="\t", header=None)

# Drop NA values if any
outflank_snps.dropna(inplace=True)
pcadapt_snps.dropna(inplace=True)

# Create sets from the first column of each dataframe
set_outflank = set(outflank_snps[0])
set_pcadapt = set(pcadapt_snps[0])

# Create a Venn diagram
venn2([set_outflank, set_pcadapt], ('outFlank', 'pcadapt'), set_colors=('blue', 'gray'))
## <matplotlib_venn._common.VennDiagram object at 0x7fcda1e33700>
# Save the plot in PDF format
plt.savefig(pdf_output_path, format='pdf')

# Show the plot
plt.show()

# Clearing the environment
del path_outflank, path_pcadapt, pdf_output_path, outflank_snps, pcadapt_snps, set_outflank, set_pcadapt
plt.clf()  # Clear the current figure
plt.cla()  # Clear the current axes

# Garbage collection
gc.collect()
## 396

Create plot with the shared SNPs

# Filter the dataframe
filtered_df <- pcadapt_res_df |> filter(SNP %in% common_SNPs)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic")
  ) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "pcadapt_outflank.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)

We can create windows and count how many highlighted SNPs we have in each window. Then we can focus in genomic areas with several SNPs

cluster_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNP_count = n(), .groups = 'drop')

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  geom_vline(
    data = cluster_df,
    aes(xintercept = Window_Start + 5000000),
    color = "lightgray",
    linetype = "dashed",
    linewidth = 0.5
  ) +
  geom_text_repel(
    data = cluster_df,
    aes(x = Window_Start + 5000000, y = 15, label = SNP_count),
    vjust = -1,
    hjust = -0.5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank \n Gray bars represent 10Mb windows and \n the number of SNPs per window is annotated in black.") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic"),
    legend.position = "none" # Remove legend
  ) +
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

Check the SNP ids for each cluster

# Create a new dataframe to capture SNPs within each 100kb window for each chromosome
cluster_snps_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNPs = list(SNP), .groups = 'drop')

# Unnest the SNPs column to show each SNP on its own row
unnested_cluster_snps_df <- cluster_snps_df %>% 
  unnest(SNPs)

# View the unnested data
head(unnested_cluster_snps_df)
## # A tibble: 6 × 3
##   Chromosome Window_Start SNPs        
##   <chr>             <dbl> <chr>       
## 1 1             140000000 AX-583515734
## 2 2              30000000 AX-584444558
## 3 2              50000000 AX-584502184
## 4 2             180000000 AX-585196879
## 5 2             320000000 AX-584907866
## 6 2             380000000 AX-579548089
# Convert the list column to a string column
cluster_snps_df_str <- cluster_snps_df %>%
  mutate(SNPs = sapply(SNPs, function(x) paste(x, collapse = ", ")))

# Create the flextable
my_flextable <- flextable(cluster_snps_df_str, cwidth = 2.75)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Chromosome

Window_Start

SNPs

1

140,000,000

AX-583515734

2

30,000,000

AX-584444558

2

50,000,000

AX-584502184

2

180,000,000

AX-585196879

2

320,000,000

AX-584907866

2

380,000,000

AX-579548089

2

390,000,000

AX-579560686, AX-579584369, AX-579583040, AX-579584883

2

400,000,000

AX-579604213, AX-579607140, AX-579619995

2

410,000,000

AX-579630465, AX-579632196, AX-579632983, AX-579632927, AX-579661979, AX-579667335

2

510,000,000

AX-579987164

2

560,000,000

AX-580192850, AX-580195612

3

0

AX-580344997

3

180,000,000

AX-581275893

3

190,000,000

AX-581302901

3

210,000,000

AX-581437212, AX-581438408, AX-581442470, AX-581461793, AX-581462648, AX-581467653

3

220,000,000

AX-581504582, AX-581512892

3

230,000,000

AX-581527485, AX-581534474

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "pcadapt", "snps_pcadap_outflank.docx"))

3. pcadapt with all 2 populations (MAN and AUT)

pcadapt performs principal component analysis and computes p-values to test for outliers. The test for outliers is based on the correlations between genetic variation and the first K principal components. pcadapt also handles Pool-seq data for which the statistical analysis is performed on the genetic markers frequencies. Returns an object of class pcadapt.

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 3134638 167.5    4677312 249.8         NA  4677312 249.8
## Vcells 6506499  49.7   17479945 133.4      32768 17479945 133.4
# read the file
x <- read.pcadapt(here("output", "outflank","man_aut.bed"), type = "bed")
# Run pcadapt with a large number of PCs
pcadapt_res <- pcadapt(x, K = 30)

# Create a scree plot
plot(pcadapt_res, option = "screeplot")

Choose PCs

# perform the analysis
pcadapt_res <- pcadapt(
  x, 
  method = c("mahalanobis"),
  min.maf = 0.1,
  LD.clumping = NULL,
  tol = 1e-04,
  K = 2) # choose the right K for your data

str(pcadapt_res)
## List of 11
##  $ scores         : num [1:38, 1:2] -0.258 -0.295 -0.291 -0.275 -0.191 ...
##  $ singular.values: num [1:2] 0.386 0.326
##  $ loadings       : num [1:41100, 1:2] NA NA -0.00249 -0.00146 -0.00379 ...
##  $ zscores        : num [1:41100, 1:2] NA NA -1.11 -0.665 -1.531 ...
##  $ af             : num [1:41100] 0.958 0.973 0.667 0.829 0.566 ...
##  $ maf            : num [1:41100] 0.0417 0.027 0.3333 0.1711 0.4342 ...
##  $ chi2.stat      : num [1:41100] NA NA 0.151 0.311 0.824 ...
##  $ stat           : num [1:41100] NA NA 0.211 0.435 1.154 ...
##  $ gif            : num 1.4
##  $ pvalues        : num [1:41100] NA NA 0.927 0.856 0.662 ...
##  $ pass           : int [1:29986] 3 4 5 6 7 8 9 10 12 14 ...
##  - attr(*, "K")= num 2
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.1
##  - attr(*, "class")= chr "pcadapt"
#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "output", "outflank", "man_aut.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-581444870     0    97856 C       T      
## 2 1        AX-583035083     0   305518 A       G      
## 3 1        AX-583035102     0   308124 A       G      
## 4 1        AX-583033342     0   315059 C       G      
## 5 1        AX-583035163     0   315386 A       G      
## 6 1        AX-583035198     0   330908 G       T
# Combine into a data frame
pcadapt_res_df <- data.frame(
  SNP = snps$SNP,
  Chromosome = snps$Scaffold,
  Position = snps$Position,
  pvalues = pcadapt_res$pvalues,
  stat = pcadapt_res$stat
) |> drop_na()
head(pcadapt_res_df)
##            SNP Chromosome Position   pvalues      stat
## 1 AX-583035102          1   308124 0.9273580 0.2113363
## 2 AX-583033342          1   315059 0.8561877 0.4350990
## 3 AX-583035163          1   315386 0.6623355 1.1544955
## 4 AX-583035198          1   330908 0.8934176 0.3158213
## 5 AX-583035257          1   442875 0.2274103 4.1501875
## 6 AX-583033504          1   492687 0.4437341 2.2769426

Adjust

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Get the SNP IDs of the outliers
outlier_snps <- pcadapt_res_df$SNP[outliers_indices]

# Print the SNP IDs
length(outlier_snps)
## [1] 111
# Save it
write.table(
  outlier_snps,
  file = here("output", "pcadapt","man_aut_SNPs_pcadapt.txt"),
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  sep = "\n"
)
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6")  # Add or remove colors as needed.
names(color_vector) <- unique(pcadapt_res_df$Chromosome)  # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Create a new 'highlight' column, initially set to FALSE
pcadapt_res_df$highlight <- FALSE

# Update the 'highlight' column to highlight these outlier SNPs
pcadapt_res_df$highlight[outliers_indices] <- TRUE

# Identify the SNP with the smallest p-value in each chromosome among the associated SNPs
min_pval_snps <- pcadapt_res_df |>
  filter(highlight) |>
  group_by(Chromosome) |>
  slice(which.min(pvalues))

# Custom label function
k_format <- function(x) {
  paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}

# source the plotting function
source(
  here(
    "scripts", "analysis", "my_theme2.R"
  )
)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(aes(color = Chromosome),
             data = subset(pcadapt_res_df, highlight == FALSE),
             size = .5) +
  geom_point(
    color = "magenta",
    data = subset(pcadapt_res_df, highlight == TRUE),
    size = .5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(title = "pcadapt Brazil", x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after Benjamini-Hochberg adjustment (p<0.05).") +
  facet_wrap(~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(panel.spacing = unit(.2, "lines"),
        plot.margin = unit(c(1, 1, 2, 2), "lines"),
        plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "man_aut_pcadapt.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)
plot(pcadapt_res, option = "qqplot")

plot(pcadapt_res, option = "scores")

hist(pcadapt_res$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "pink")

plot(pcadapt_res, option = "stat.distribution")

Choosing a cutoff for outlier detection

q-values

qval <- qvalue(pcadapt_res$pvalues)$qvalues
alpha <- 0.05
outliers <- which(qval < alpha)
length(outliers)
## [1] 111

Benjamini-Hochberg Procedure

padj <- p.adjust(pcadapt_res$pvalues,method="BH")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 111

Bonferroni correction

padj <- p.adjust(pcadapt_res$pvalues,method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 34

Loadings

par(mfrow = c(2, 2))
for (i in 1:2)
  plot(pcadapt_res$loadings[, i], pch = 19, cex = .3, ylab = paste0("Loadings PC", i))

3.1 Venn diagram outFlank vs pcadapt

# Read data from txt files
outflank_snps <-
  read.table(
    here("output", "outflank", "man_aut_SNPs_outFlank.txt"),
    stringsAsFactors = FALSE
  ) |>
  drop_na()

pcadapt_snps <-
  read.table(
    here("output", "pcadapt", "man_aut_SNPs_pcadapt.txt"),
    stringsAsFactors = FALSE
  )

# Create a list with all dataframes
list_of_clusters <- list("outFlank" = outflank_snps$V1, "pcadapt" = pcadapt_snps$V1)


# Create Venn diagram
venn_diagram <- ggvenn(list_of_clusters, fill_color = c("steelblue", "darkorange"))
print(venn_diagram)

# Find common SNPs
common_SNPs <- Reduce(intersect, list_of_clusters)

# Save the shared SNP ids into a txt file
write.table(
  common_SNPs,
  file = here("output", "pcadapt", "man_aut_common_SNPs_pcadapt_outflank.txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "man_aut_significant_snps.pdf")
ggsave(output_path, venn_diagram, height = 5, width = 5, dpi = 300)

With python

Clean env

py_run_string("import gc; gc.collect()")

Venn

import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

# File paths
path_outflank = 'output/outflank/man_aut_SNPs_outFlank.txt'
path_pcadapt = 'output/pcadapt/man_aut_SNPs_pcadapt.txt'
pdf_output_path = 'output/pcadapt/venn_MAN_AUT_pcadapt_outflank.pdf'  # PDF output file path

# Read the data (assuming the SNP IDs are in the first column)
outflank_snps = pd.read_csv(path_outflank, sep="\t", header=None)
pcadapt_snps = pd.read_csv(path_pcadapt, sep="\t", header=None)

# Drop NA values if any
outflank_snps.dropna(inplace=True)
pcadapt_snps.dropna(inplace=True)

# Create sets from the first column of each dataframe
set_outflank = set(outflank_snps[0])
set_pcadapt = set(pcadapt_snps[0])

# Create a Venn diagram
venn2([set_outflank, set_pcadapt], ('outFlank', 'pcadapt'), set_colors=('yellow', 'gray'))
## <matplotlib_venn._common.VennDiagram object at 0x7fcda216fe80>
# Save the plot in PDF format
plt.savefig(pdf_output_path, format='pdf')

# Show the plot
plt.show()

# Clearing the environment
del path_outflank, path_pcadapt, pdf_output_path, outflank_snps, pcadapt_snps, set_outflank, set_pcadapt
plt.clf()  # Clear the current figure
plt.cla()  # Clear the current axes

# Garbage collection
gc.collect()
## 406

Create plot with the shared SNPs

# Filter the dataframe
filtered_df <- pcadapt_res_df |> filter(SNP %in% common_SNPs)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic")
  ) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "man_aut_pcadapt_outflank.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)

We can create windows and count how many highlighted SNPs we have in each window. Then we can focus in genomic areas with several SNPs

cluster_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNP_count = n(), .groups = 'drop')

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  geom_vline(
    data = cluster_df,
    aes(xintercept = Window_Start + 5000000),
    color = "lightgray",
    linetype = "dashed",
    linewidth = 0.5
  ) +
  geom_text_repel(
    data = cluster_df,
    aes(x = Window_Start + 5000000, y = 15, label = SNP_count),
    vjust = -1,
    hjust = -0.5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank \n Gray bars represent 10Mb windows and \n the number of SNPs per window is annotated in black.") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic"),
    legend.position = "none" # Remove legend
  ) +
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

Check the SNP ids for each cluster

# Create a new dataframe to capture SNPs within each 100kb window for each chromosome
cluster_snps_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNPs = list(SNP), .groups = 'drop')

# Unnest the SNPs column to show each SNP on its own row
unnested_cluster_snps_df <- cluster_snps_df %>% 
  unnest(SNPs)

# View the unnested data
head(unnested_cluster_snps_df)
## # A tibble: 6 × 3
##   Chromosome Window_Start SNPs        
##   <chr>             <dbl> <chr>       
## 1 1                     0 AX-583054970
## 2 1              10000000 AX-583095890
## 3 1              90000000 AX-583324654
## 4 1             120000000 AX-583423493
## 5 1             120000000 AX-583426050
## 6 1             130000000 AX-583467551
# Convert the list column to a string column
cluster_snps_df_str <- cluster_snps_df %>%
  mutate(SNPs = sapply(SNPs, function(x) paste(x, collapse = ", ")))

# Create the flextable
my_flextable <- flextable(cluster_snps_df_str, cwidth = 2.75)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Chromosome

Window_Start

SNPs

1

0

AX-583054970

1

10,000,000

AX-583095890

1

90,000,000

AX-583324654

1

120,000,000

AX-583423493, AX-583426050

1

130,000,000

AX-583467551, AX-583504302

1

140,000,000

AX-583514148, AX-583515734, AX-583518994, AX-583517999

1

170,000,000

AX-583631472

1

270,000,000

AX-583905588, AX-583924569

1

340,000,000

AX-584133664

2

20,000,000

AX-584419201

2

30,000,000

AX-584444558, AX-584453518

2

40,000,000

AX-584473131

2

50,000,000

AX-585010439, AX-584502184, AX-585023213

2

70,000,000

AX-585042322

2

80,000,000

AX-584546335

2

110,000,000

AX-585101539

2

170,000,000

AX-585182739

2

180,000,000

AX-585190366, AX-585196879

2

200,000,000

AX-582494788, AX-582519052, AX-582532563

2

260,000,000

AX-584765066

2

320,000,000

AX-585413917

2

350,000,000

AX-584951049

2

360,000,000

AX-579474142, AX-579474650

2

370,000,000

AX-579509845

2

380,000,000

AX-579548089

2

390,000,000

AX-579560686, AX-579580051, AX-579584369, AX-579583040, AX-579584883

2

400,000,000

AX-579604213, AX-579603553, AX-579607140, AX-579606795, AX-579619995, AX-579620627

2

410,000,000

AX-579630462, AX-579630465, AX-579632196, AX-579632927, AX-579636106, AX-579638540, AX-579640459, AX-579646083, AX-579661979, AX-579662371

2

450,000,000

AX-579768613, AX-579778684

2

460,000,000

AX-579808128, AX-579816168

2

490,000,000

AX-579909895

2

510,000,000

AX-579955661, AX-579982534

2

520,000,000

AX-580001880, AX-580026657, AX-580027537, AX-580042929

2

540,000,000

AX-580099245

2

550,000,000

AX-580165355

2

560,000,000

AX-580195612

2

580,000,000

AX-580318010, AX-580321888

3

0

AX-580342672

3

20,000,000

AX-580425125

3

70,000,000

AX-580620056

3

80,000,000

AX-580673467

3

90,000,000

AX-580719743

3

130,000,000

AX-580926966, AX-580930203

3

180,000,000

AX-581275893

3

190,000,000

AX-581298415, AX-581302704

3

200,000,000

AX-581399096

3

210,000,000

AX-581408938, AX-581413330, AX-581437212, AX-581438408, AX-581442470, AX-581458697, AX-581461793, AX-581462648, AX-581467653, AX-581470736

3

220,000,000

AX-581485809, AX-581488185

3

230,000,000

AX-581572119

3

280,000,000

AX-581761610

3

350,000,000

AX-582085824

3

380,000,000

AX-582237115

3

440,000,000

AX-582822062, AX-582853985

3

450,000,000

AX-582884221

3

480,000,000

AX-583025646

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "pcadapt", "man_aut_snps_pcadap_outflank.docx"))

4. pcadapt with all 2 populations (NEW and AUT)

pcadapt performs principal component analysis and computes p-values to test for outliers. The test for outliers is based on the correlations between genetic variation and the first K principal components. pcadapt also handles Pool-seq data for which the statistical analysis is performed on the genetic markers frequencies. Returns an object of class pcadapt.

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 3133463 167.4    4677312 249.8         NA  4677312 249.8
## Vcells 6176454  47.2   21055934 160.7      32768 21055500 160.7
# read the file
x <- read.pcadapt(here("output", "outflank","new_aut.bed"), type = "bed")
# Run pcadapt with a large number of PCs
pcadapt_res <- pcadapt(x, K = 30)

# Create a scree plot
plot(pcadapt_res, option = "screeplot")

Choose PCs

# perform the analysis
pcadapt_res <- pcadapt(
  x, 
  method = c("mahalanobis"),
  min.maf = 0.1,
  LD.clumping = NULL,
  tol = 1e-04,
  K = 2) # choose the right K for your data

str(pcadapt_res)
## List of 11
##  $ scores         : num [1:50, 1:2] -0.1556 -0.1445 -0.0718 -0.116 -0.1717 ...
##  $ singular.values: num [1:2] 0.408 0.293
##  $ loadings       : num [1:41468, 1:2] -0.00753 NA 0.00585 0.00398 0.00379 ...
##  $ zscores        : num [1:41468, 1:2] -5.38 NA 2.9 2.27 1.94 ...
##  $ af             : num [1:41468] 0.888 0.967 0.728 0.86 0.6 ...
##  $ maf            : num [1:41468] 0.1122 0.0326 0.2717 0.14 0.4 ...
##  $ chi2.stat      : num [1:41468] 6.86 NA 0.892 0.624 0.381 ...
##  $ stat           : num [1:41468] 9.431 NA 1.227 0.857 0.524 ...
##  $ gif            : num 1.37
##  $ pvalues        : num [1:41468] 0.0324 NA 0.6401 0.7322 0.8265 ...
##  $ pass           : int [1:30833] 1 3 4 5 6 7 8 9 10 12 ...
##  - attr(*, "K")= num 2
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.1
##  - attr(*, "class")= chr "pcadapt"
#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "output", "outflank", "new_aut.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-581444870     0    97856 C       T      
## 2 1        AX-583035083     0   305518 A       G      
## 3 1        AX-583035102     0   308124 A       G      
## 4 1        AX-583033342     0   315059 C       G      
## 5 1        AX-583035163     0   315386 A       G      
## 6 1        AX-583035198     0   330908 G       T
# Combine into a data frame
pcadapt_res_df <- data.frame(
  SNP = snps$SNP,
  Chromosome = snps$Scaffold,
  Position = snps$Position,
  pvalues = pcadapt_res$pvalues,
  stat = pcadapt_res$stat
) |> drop_na()
head(pcadapt_res_df)
##            SNP Chromosome Position    pvalues      stat
## 1 AX-581444870          1    97856 0.03239121 9.4305815
## 2 AX-583035102          1   308124 0.64013391 1.2265118
## 3 AX-583033342          1   315059 0.73215941 0.8571903
## 4 AX-583035163          1   315386 0.82646667 0.5240517
## 5 AX-583035198          1   330908 0.55989336 1.5947615
## 6 AX-583035257          1   442875 0.28774446 3.4250617

Adjust

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Get the SNP IDs of the outliers
outlier_snps <- pcadapt_res_df$SNP[outliers_indices]

# Print the SNP IDs
length(outlier_snps)
## [1] 67
# Save it
write.table(
  outlier_snps,
  file = here("output", "pcadapt","new_aut_SNPs_pcadapt.txt"),
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  sep = "\n"
)
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6")  # Add or remove colors as needed.
names(color_vector) <- unique(pcadapt_res_df$Chromosome)  # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Create a new 'highlight' column, initially set to FALSE
pcadapt_res_df$highlight <- FALSE

# Update the 'highlight' column to highlight these outlier SNPs
pcadapt_res_df$highlight[outliers_indices] <- TRUE

# Identify the SNP with the smallest p-value in each chromosome among the associated SNPs
min_pval_snps <- pcadapt_res_df |>
  filter(highlight) |>
  group_by(Chromosome) |>
  slice(which.min(pvalues))

# Custom label function
k_format <- function(x) {
  paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}

# source the plotting function
source(
  here(
    "scripts", "analysis", "my_theme2.R"
  )
)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(aes(color = Chromosome),
             data = subset(pcadapt_res_df, highlight == FALSE),
             size = .5) +
  geom_point(
    color = "magenta",
    data = subset(pcadapt_res_df, highlight == TRUE),
    size = .5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(title = "pcadapt Brazil", x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after Benjamini-Hochberg adjustment (p<0.05).") +
  facet_wrap(~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(panel.spacing = unit(.2, "lines"),
        plot.margin = unit(c(1, 1, 2, 2), "lines"),
        plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "new_aut_pcadapt.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)
plot(pcadapt_res, option = "qqplot")

plot(pcadapt_res, option = "scores")

hist(pcadapt_res$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "pink")

plot(pcadapt_res, option = "stat.distribution")

Choosing a cutoff for outlier detection

q-values

qval <- qvalue(pcadapt_res$pvalues)$qvalues
alpha <- 0.05
outliers <- which(qval < alpha)
length(outliers)
## [1] 67

Benjamini-Hochberg Procedure

padj <- p.adjust(pcadapt_res$pvalues,method="BH")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 67

Bonferroni correction

padj <- p.adjust(pcadapt_res$pvalues,method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 17

Loadings

par(mfrow = c(2, 2))
for (i in 1:2)
  plot(pcadapt_res$loadings[, i], pch = 19, cex = .3, ylab = paste0("Loadings PC", i))

4.1 Venn diagram outFlank vs pcadapt

# Read data from txt files
outflank_snps <-
  read.table(
    here("output", "outflank", "new_aut_SNPs_outFlank.txt"),
    stringsAsFactors = FALSE
  ) |>
  drop_na()

pcadapt_snps <-
  read.table(
    here("output", "pcadapt", "new_aut_SNPs_pcadapt.txt"),
    stringsAsFactors = FALSE
  )

# Create a list with all dataframes
list_of_clusters <- list("outFlank" = outflank_snps$V1, "pcadapt" = pcadapt_snps$V1)


# Create Venn diagram
venn_diagram <- ggvenn(list_of_clusters, fill_color = c("steelblue", "darkorange"))
print(venn_diagram)

# Find common SNPs
common_SNPs <- Reduce(intersect, list_of_clusters)

# Save the shared SNP ids into a txt file
write.table(
  common_SNPs,
  file = here("output", "pcadapt", "new_aut_common_SNPs_pcadapt_outflank.txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "new_aut_significant_snps.pdf")
ggsave(output_path, venn_diagram, height = 5, width = 5, dpi = 300)

With python

Clean env

py_run_string("import gc; gc.collect()")

Venn

import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

# File paths
path_outflank = 'output/outflank/new_aut_SNPs_outFlank.txt'
path_pcadapt = 'output/pcadapt/new_aut_SNPs_pcadapt.txt'
pdf_output_path = 'output/pcadapt/venn_NEW_AUT_pcadapt_outflank.pdf'  # PDF output file path

# Read the data (assuming the SNP IDs are in the first column)
outflank_snps = pd.read_csv(path_outflank, sep="\t", header=None)
pcadapt_snps = pd.read_csv(path_pcadapt, sep="\t", header=None)

# Drop NA values if any
outflank_snps.dropna(inplace=True)
pcadapt_snps.dropna(inplace=True)

# Create sets from the first column of each dataframe
set_outflank = set(outflank_snps[0])
set_pcadapt = set(pcadapt_snps[0])

# Create a Venn diagram
venn2([set_outflank, set_pcadapt], ('outFlank', 'pcadapt'), set_colors=('green', 'gray'))
## <matplotlib_venn._common.VennDiagram object at 0x7fcdc0dabd90>
# Save the plot in PDF format
plt.savefig(pdf_output_path, format='pdf')

# Show the plot
plt.show()

# Clearing the environment
del path_outflank, path_pcadapt, pdf_output_path, outflank_snps, pcadapt_snps, set_outflank, set_pcadapt
plt.clf()  # Clear the current figure
plt.cla()  # Clear the current axes

# Garbage collection
gc.collect()
## 401

Create plot with the shared SNPs

# Filter the dataframe
filtered_df <- pcadapt_res_df |> filter(SNP %in% common_SNPs)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic")
  ) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "new_aut_pcadapt_outflank.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)

We can create windows and count how many highlighted SNPs we have in each window. Then we can focus in genomic areas with several SNPs

cluster_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNP_count = n(), .groups = 'drop')

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  geom_vline(
    data = cluster_df,
    aes(xintercept = Window_Start + 5000000),
    color = "lightgray",
    linetype = "dashed",
    linewidth = 0.5
  ) +
  geom_text_repel(
    data = cluster_df,
    aes(x = Window_Start + 5000000, y = 15, label = SNP_count),
    vjust = -1,
    hjust = -0.5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank \n Gray bars represent 10Mb windows and \n the number of SNPs per window is annotated in black.") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic"),
    legend.position = "none" # Remove legend
  ) +
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

Check the SNP ids for each cluster

# Create a new dataframe to capture SNPs within each 100kb window for each chromosome
cluster_snps_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNPs = list(SNP), .groups = 'drop')

# Unnest the SNPs column to show each SNP on its own row
unnested_cluster_snps_df <- cluster_snps_df %>% 
  unnest(SNPs)

# View the unnested data
head(unnested_cluster_snps_df)
## # A tibble: 6 × 3
##   Chromosome Window_Start SNPs        
##   <chr>             <dbl> <chr>       
## 1 1             140000000 AX-583515734
## 2 1             140000000 AX-583518586
## 3 1             140000000 AX-583516491
## 4 1             140000000 AX-583517344
## 5 1             140000000 AX-583521326
## 6 1             140000000 AX-583531703
# Convert the list column to a string column
cluster_snps_df_str <- cluster_snps_df %>%
  mutate(SNPs = sapply(SNPs, function(x) paste(x, collapse = ", ")))

# Create the flextable
my_flextable <- flextable(cluster_snps_df_str, cwidth = 2.75)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Chromosome

Window_Start

SNPs

1

140,000,000

AX-583515734, AX-583518586, AX-583516491, AX-583517344, AX-583521326, AX-583531703

1

220,000,000

AX-583750740, AX-583750871

1

260,000,000

AX-583882507, AX-583878485

1

270,000,000

AX-583924978

1

300,000,000

AX-583972796

2

20,000,000

AX-584399488, AX-584419201

2

30,000,000

AX-584429924

2

120,000,000

AX-585110132

2

180,000,000

AX-585196879

2

190,000,000

AX-582457468, AX-582465641

2

200,000,000

AX-582535272

2

320,000,000

AX-584907866

2

380,000,000

AX-579548089

2

390,000,000

AX-579556477, AX-579560686, AX-579564292, AX-579565469, AX-579584369, AX-579583040, AX-579584883

2

400,000,000

AX-579596233, AX-579602153, AX-579618571

2

410,000,000

AX-579630465, AX-579632196, AX-579632983, AX-579632074, AX-579632927, AX-579661979

2

420,000,000

AX-579681911, AX-579693902, AX-579697016

2

500,000,000

AX-579940789

2

570,000,000

AX-580238721

3

0

AX-580344997

3

10,000,000

AX-580398903

3

30,000,000

AX-580480108

3

40,000,000

AX-580560428

3

50,000,000

AX-580575055

3

130,000,000

AX-580917486, AX-580925458

3

150,000,000

AX-581030099

3

180,000,000

AX-581275893

3

190,000,000

AX-581302901

3

210,000,000

AX-581428225, AX-581437212, AX-581438408, AX-581442470, AX-581462648, AX-581467653

3

230,000,000

AX-581528715, AX-581534474, AX-581543825

3

290,000,000

AX-581810815

3

320,000,000

AX-581929530

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "pcadapt", "new_aut_snps_pcadap_outflank.docx"))

5. pcadapt with all 2 populations (NEW and MAN)

pcadapt performs principal component analysis and computes p-values to test for outliers. The test for outliers is based on the correlations between genetic variation and the first K principal components. pcadapt also handles Pool-seq data for which the statistical analysis is performed on the genetic markers frequencies. Returns an object of class pcadapt.

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 3134946 167.5    4677312 249.8         NA  4677312 249.8
## Vcells 5953847  45.5   21055934 160.7      32768 21055934 160.7
# read the file
x <- read.pcadapt(here("output", "outflank","man_new.bed"), type = "bed")
# Run pcadapt with a large number of PCs
pcadapt_res <- pcadapt(x, K = 30)

# Create a scree plot
plot(pcadapt_res, option = "screeplot")

Choose PCs

# perform the analysis
pcadapt_res <- pcadapt(
  x, 
  method = c("mahalanobis"),
  min.maf = 0.1,
  LD.clumping = NULL,
  tol = 1e-04,
  K = 3) # choose the right K for your data

str(pcadapt_res)
## List of 11
##  $ scores         : num [1:32, 1:3] -0.0626 -0.1343 -0.0743 -0.1028 -0.0451 ...
##  $ singular.values: num [1:3] 0.361 0.335 0.26
##  $ loadings       : num [1:40985, 1:3] 0.00293 NA 0.0055 0.0033 0.00196 ...
##  $ zscores        : num [1:40985, 1:3] 1.298 NA 1.979 1.322 0.746 ...
##  $ af             : num [1:40985] 0.774 0.981 0.783 0.891 0.703 ...
##  $ maf            : num [1:40985] 0.2258 0.0185 0.2167 0.1094 0.2969 ...
##  $ chi2.stat      : num [1:40985] 1.751 NA 2.479 1.159 0.432 ...
##  $ stat           : num [1:40985] 2.208 NA 3.127 1.462 0.545 ...
##  $ gif            : num 1.26
##  $ pvalues        : num [1:40985] 0.626 NA 0.479 0.763 0.934 ...
##  $ pass           : int [1:33022] 1 3 4 5 6 7 8 9 10 12 ...
##  - attr(*, "K")= num 3
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.1
##  - attr(*, "class")= chr "pcadapt"
#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "output", "outflank", "man_new.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-581444870     0    97856 C       T      
## 2 1        AX-583035083     0   305518 A       G      
## 3 1        AX-583035102     0   308124 A       G      
## 4 1        AX-583033342     0   315059 C       G      
## 5 1        AX-583035163     0   315386 A       G      
## 6 1        AX-583035198     0   330908 G       T
# Combine into a data frame
pcadapt_res_df <- data.frame(
  SNP = snps$SNP,
  Chromosome = snps$Scaffold,
  Position = snps$Position,
  pvalues = pcadapt_res$pvalues,
  stat = pcadapt_res$stat
) |> drop_na()
head(pcadapt_res_df)
##            SNP Chromosome Position   pvalues      stat
## 1 AX-581444870          1    97856 0.6257264 2.2084728
## 2 AX-583035102          1   308124 0.4791261 3.1270554
## 3 AX-583033342          1   315059 0.7628384 1.4621540
## 4 AX-583035163          1   315386 0.9335466 0.5450194
## 5 AX-583035198          1   330908 0.8435517 1.0403490
## 6 AX-583035257          1   442875 0.8339275 1.0908654

Adjust

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Get the SNP IDs of the outliers
outlier_snps <- pcadapt_res_df$SNP[outliers_indices]

# Print the SNP IDs
length(outlier_snps)
## [1] 79
# Save it
write.table(
  outlier_snps,
  file = here("output", "pcadapt","man_new_SNPs_pcadapt.txt"),
  row.names = FALSE,
  quote = FALSE,
  col.names = FALSE,
  sep = "\n"
)
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6")  # Add or remove colors as needed.
names(color_vector) <- unique(pcadapt_res_df$Chromosome)  # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.

# Adjust the p-values using Benjamini-Hochberg method
padj <- p.adjust(pcadapt_res_df$pvalues,method="BH")

# Define significance level
alpha <- 0.05

# Get indices of significant SNPs
outliers_indices <- which(padj < alpha)

# Create a new 'highlight' column, initially set to FALSE
pcadapt_res_df$highlight <- FALSE

# Update the 'highlight' column to highlight these outlier SNPs
pcadapt_res_df$highlight[outliers_indices] <- TRUE

# Identify the SNP with the smallest p-value in each chromosome among the associated SNPs
min_pval_snps <- pcadapt_res_df |>
  filter(highlight) |>
  group_by(Chromosome) |>
  slice(which.min(pvalues))

# Custom label function
k_format <- function(x) {
  paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}

# source the plotting function
source(
  here(
    "scripts", "analysis", "my_theme2.R"
  )
)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(aes(color = Chromosome),
             data = subset(pcadapt_res_df, highlight == FALSE),
             size = .5) +
  geom_point(
    color = "magenta",
    data = subset(pcadapt_res_df, highlight == TRUE),
    size = .5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(title = "pcadapt Brazil", x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after Benjamini-Hochberg adjustment (p<0.05).") +
  facet_wrap(~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(panel.spacing = unit(.2, "lines"),
        plot.margin = unit(c(1, 1, 2, 2), "lines"),
        plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
  # Annotate only the SNP with the smallest p-value for each chromosome among the associated SNPs
  geom_text_repel(
    data = min_pval_snps,
    aes(label = SNP),
    size = 3,
    nudge_y = 0.01,
    segment.color = NA,
    max.overlaps = Inf # Set to 'Inf' to show all annotations
  )

# Save the plot
ggsave(
  here(
    "output", "pcadapt", "figures", "man_new_pcadapt.pdf"
  ),
  width  = 8,
  height = 5,
  units  = "in"
)
plot(pcadapt_res, option = "qqplot")

plot(pcadapt_res, option = "scores")

hist(pcadapt_res$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "pink")

plot(pcadapt_res, option = "stat.distribution")

Choosing a cutoff for outlier detection

q-values

qval <- qvalue(pcadapt_res$pvalues)$qvalues
alpha <- 0.05
outliers <- which(qval < alpha)
length(outliers)
## [1] 88

Benjamini-Hochberg Procedure

padj <- p.adjust(pcadapt_res$pvalues,method="BH")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 79

Bonferroni correction

padj <- p.adjust(pcadapt_res$pvalues,method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
length(outliers)
## [1] 21

Loadings

par(mfrow = c(2, 2))
for (i in 1:2)
  plot(pcadapt_res$loadings[, i], pch = 19, cex = .3, ylab = paste0("Loadings PC", i))

5.1 Venn diagram outFlank vs pcadapt

We can make a venn diagram because outFlank did not identify any outlier.

6. Venn diagram

# Read data from txt files
NEW_MAN_AUT <-
  read.table(
    here("output", "pcadapt", "common_SNPs_pcadapt_outflank.txt"),
    stringsAsFactors = FALSE
  ) |>
  drop_na()

MAN_AUT <-
  read.table(
    here("output", "pcadapt", "man_aut_common_SNPs_pcadapt_outflank.txt"),
    stringsAsFactors = FALSE
  )


NEW_AUT <-
  read.table(
    here("output", "pcadapt", "new_aut_common_SNPs_pcadapt_outflank.txt"),
    stringsAsFactors = FALSE
  )


NEW_MAN <-
  read.table(
    here("output", "pcadapt", "man_new_SNPs_pcadapt.txt"),
    stringsAsFactors = FALSE
  )

# Create a list with all dataframes
list_of_clusters <- list("NEW_MAN_AUT" = NEW_MAN_AUT$V1, "MAN_AUT" = MAN_AUT$V1, "NEW_AUT" = NEW_AUT$V1, "NEW_MAN" = NEW_MAN$V1)


# Create Venn diagram
venn_diagram <- ggvenn(list_of_clusters)

# Increase plot margins
venn_diagram <- venn_diagram + theme(plot.margin = margin(60, 60, 60, 60, "points"))

# Resize text
venn_diagram <- venn_diagram + theme(text = element_text(size = 5))


# Print the adjusted plot
print(venn_diagram)

# Find the intersect of NEW_MAN_AUT, MAN_AUT, and NEW_AUT
common_elements <- Reduce(intersect, list(NEW_MAN_AUT$V1, MAN_AUT$V1, NEW_AUT$V1))


# # Save the shared SNP ids into a txt file
write.table(
  common_elements,
  file = here("output", "pcadapt", "4_way_venn_common_SNPs_pcadapt_outflank.txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "4_way_venn_significant_snps.pdf")
ggsave(output_path, venn_diagram, height = 8, width = 8, dpi = 300)

With python Create plot with the shared SNPs

# Filter the dataframe
filtered_df <- pcadapt_res_df |> 
  filter(SNP %in% common_elements)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic")
  )

We can create windows and count how many highlighted SNPs we have in each window. Then we can focus in genomic areas with several SNPs

cluster_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNP_count = n(), .groups = 'drop')

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  geom_vline(
    data = cluster_df,
    aes(xintercept = Window_Start + 5000000),
    color = "lightgray",
    linetype = "dashed",
    linewidth = 0.5
  ) +
  geom_text_repel(
    data = cluster_df,
    aes(x = Window_Start + 5000000, y = 10, label = SNP_count),
    vjust = -1,
    hjust = -0.5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank \n Gray bars represent 10Mb windows and \n the number of SNPs per window is annotated in black.") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic"),
    legend.position = "none" # Remove legend
  ) 

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "manhattan_from_4_way_venn_snps.pdf")
ggsave(output_path, height = 5, width = 8, dpi = 300)

Check the SNP ids for each cluster

# Create a new dataframe to capture SNPs within each 100kb window for each chromosome
cluster_snps_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNPs = list(SNP), .groups = 'drop')

# Unnest the SNPs column to show each SNP on its own row
unnested_cluster_snps_df <- cluster_snps_df %>% 
  unnest(SNPs)

# View the unnested data
head(unnested_cluster_snps_df)
## # A tibble: 6 × 3
##   Chromosome Window_Start SNPs        
##   <chr>             <dbl> <chr>       
## 1 2             180000000 AX-585196879
## 2 2             380000000 AX-579548089
## 3 2             390000000 AX-579560686
## 4 2             410000000 AX-579661979
## 5 3             180000000 AX-581275893
## 6 3             210000000 AX-581442470
# Convert the list column to a string column
cluster_snps_df_str <- cluster_snps_df %>%
  mutate(SNPs = sapply(SNPs, function(x) paste(x, collapse = ", ")))

# Create the flextable
my_flextable <- flextable(cluster_snps_df_str, cwidth = 2.75)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Chromosome

Window_Start

SNPs

2

180,000,000

AX-585196879

2

380,000,000

AX-579548089

2

390,000,000

AX-579560686

2

410,000,000

AX-579661979

3

180,000,000

AX-581275893

3

210,000,000

AX-581442470, AX-581462648, AX-581467653

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "pcadapt", "4_way_venn_pcadap_outflank.docx"))

We can also highlight all the 158 SNPs from the selection scans

# Combine all three data frames into one
combined_df <- bind_rows(MAN_AUT, NEW_AUT, NEW_MAN_AUT)

# Count unique values and add a 'Count' column
result <- combined_df %>%
  group_by(V1) %>%
  summarise(Count = n()) %>%
  arrange(V1) |>
  dplyr::rename(SNP = V1)

# Show the result
head(result)
## # A tibble: 6 × 2
##   SNP          Count
##   <chr>        <int>
## 1 AX-579474142     1
## 2 AX-579474650     1
## 3 AX-579509845     1
## 4 AX-579548089     3
## 5 AX-579556477     1
## 6 AX-579560686     3

Create plot with the shared SNPs

# Filter the dataframe
filtered_df <- pcadapt_res_df |> 
  filter(SNP %in% result$SNP)

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic")
  )

We can create windows and count how many highlighted SNPs we have in each window. Then we can focus in genomic areas with several SNPs

cluster_df <- filtered_df |>
  mutate(Window_Start = floor(Position / 10000000) * 10000000) |>
  group_by(Chromosome, Window_Start) |>
  summarise(SNP_count = n(), .groups = 'drop') |>
  dplyr::filter(SNP_count >= 3) # plot lines for windows with 3 or more SNPs

# Create the plot
ggplot(pcadapt_res_df, aes(x = Position, y = -log10(pvalues))) +
  geom_point(
    aes(color = Chromosome),
    data = subset(pcadapt_res_df, highlight == FALSE),
    size = .3
  ) +
  geom_point(
    color = "magenta",
    data = subset(filtered_df, highlight == TRUE),
    size = .3
  ) +
  geom_vline(
    data = cluster_df,
    aes(xintercept = Window_Start + 5000000),
    color = "lightgray",
    linetype = "dashed",
    linewidth = 0.5
  ) +
  geom_text_repel(
    data = cluster_df,
    aes(x = Window_Start + 5000000, y = 10, label = SNP_count),
    vjust = -1,
    hjust = -0.5
  ) +
  scale_color_manual(values = color_vector, guide = "none") +
  labs(x = "Position", y = "-log10(p-value)", caption = "SNPs highlighted in magenta are significant after \n Benjamini-Hochberg adjustment (p<0.001) with pcadapt, \nand were identified as outliers with outFlank \n Gray bars represent 10Mb windows and \n the number of SNPs per window is annotated in black.") +
  facet_wrap( ~ Chromosome, scales = "free_x") +
  scale_x_continuous(labels = k_format) +
  my_theme() +
  theme(
    panel.spacing = unit(.2, "lines"),
    plot.margin = unit(c(1, 1, 2, 2), "lines"),
    plot.caption = element_text(face = "italic"),
    legend.position = "none" # Remove legend
  ) 

# # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "manhattan_from_4_way_venn_158_snps.pdf")
ggsave(output_path, height = 5, width = 8, dpi = 300)

Check the SNP ids for each cluster

# Create a new dataframe to capture SNPs within each 100kb window for each chromosome
cluster_snps_df <- filtered_df %>%
  mutate(Window_Start = floor(Position / 10000000) * 10000000) %>%
  group_by(Chromosome, Window_Start) %>%
  summarise(SNPs = list(SNP), .groups = 'drop')

# Unnest the SNPs column to show each SNP on its own row
unnested_cluster_snps_df <- cluster_snps_df %>% 
  unnest(SNPs)

# View the unnested data
head(unnested_cluster_snps_df)
## # A tibble: 6 × 3
##   Chromosome Window_Start SNPs        
##   <chr>             <dbl> <chr>       
## 1 1                     0 AX-583054970
## 2 1              10000000 AX-583095890
## 3 1              90000000 AX-583324654
## 4 1             120000000 AX-583423493
## 5 1             120000000 AX-583426050
## 6 1             130000000 AX-583467551
# Convert the list column to a string column
cluster_snps_df_str <- cluster_snps_df %>%
  mutate(SNPs = sapply(SNPs, function(x) paste(x, collapse = ", ")))

# Create the flextable
my_flextable <- flextable(cluster_snps_df_str, cwidth = 2.75)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Chromosome

Window_Start

SNPs

1

0

AX-583054970

1

10,000,000

AX-583095890

1

90,000,000

AX-583324654

1

120,000,000

AX-583423493, AX-583426050

1

130,000,000

AX-583467551, AX-583504302

1

140,000,000

AX-583514148, AX-583518586, AX-583518994, AX-583516491, AX-583517344, AX-583517999, AX-583521326, AX-583531703

1

170,000,000

AX-583631472

1

220,000,000

AX-583750740, AX-583750871

1

260,000,000

AX-583882507, AX-583878485

1

270,000,000

AX-583905588, AX-583924569, AX-583924978

1

300,000,000

AX-583972796

1

340,000,000

AX-584133664

2

20,000,000

AX-584399488, AX-584419201

2

30,000,000

AX-584429924, AX-584444558, AX-584453518

2

40,000,000

AX-584473131

2

50,000,000

AX-585010439, AX-584502184, AX-585023213

2

70,000,000

AX-585042322

2

80,000,000

AX-584546335

2

110,000,000

AX-585101539

2

120,000,000

AX-585110132

2

170,000,000

AX-585182739

2

180,000,000

AX-585190366, AX-585196879

2

190,000,000

AX-582457468, AX-582465641

2

200,000,000

AX-582494788, AX-582519052, AX-582532563, AX-582535272

2

260,000,000

AX-584765066

2

320,000,000

AX-585413917, AX-584907866

2

350,000,000

AX-584951049

2

360,000,000

AX-579474142, AX-579474650

2

370,000,000

AX-579509845

2

380,000,000

AX-579548089

2

390,000,000

AX-579556477, AX-579560686, AX-579564292, AX-579565469, AX-579580051

2

400,000,000

AX-579596233, AX-579602153, AX-579603553, AX-579606795, AX-579618571, AX-579619995, AX-579620627

2

410,000,000

AX-579630462, AX-579632983, AX-579632074, AX-579636106, AX-579638540, AX-579640459, AX-579646083, AX-579661979, AX-579662371, AX-579667335

2

420,000,000

AX-579693902, AX-579697016

2

450,000,000

AX-579768613, AX-579778684

2

460,000,000

AX-579808128

2

490,000,000

AX-579909895

2

500,000,000

AX-579940789

2

510,000,000

AX-579955661, AX-579982534

2

520,000,000

AX-580001880, AX-580026657, AX-580027537, AX-580042929

2

540,000,000

AX-580099245

2

550,000,000

AX-580165355

2

560,000,000

AX-580192850, AX-580195612

2

570,000,000

AX-580238721

2

580,000,000

AX-580318010, AX-580321888

3

0

AX-580342672, AX-580344997

3

10,000,000

AX-580398903

3

20,000,000

AX-580425125

3

40,000,000

AX-580560428

3

50,000,000

AX-580575055

3

70,000,000

AX-580620056

3

80,000,000

AX-580673467

3

90,000,000

AX-580719743

3

130,000,000

AX-580917486, AX-580925458, AX-580926966

3

150,000,000

AX-581030099

3

180,000,000

AX-581275893

3

190,000,000

AX-581298415, AX-581302704, AX-581302901

3

200,000,000

AX-581399096

3

210,000,000

AX-581408938, AX-581413330, AX-581428225, AX-581442470, AX-581458697, AX-581461793, AX-581462648, AX-581467653, AX-581470736

3

220,000,000

AX-581485809, AX-581488185, AX-581504582

3

230,000,000

AX-581528715, AX-581543825, AX-581572119

3

280,000,000

AX-581761610

3

290,000,000

AX-581810815

3

320,000,000

AX-581929530

3

350,000,000

AX-582085824

3

380,000,000

AX-582237115

3

440,000,000

AX-582822062, AX-582853985

3

450,000,000

AX-582884221

3

480,000,000

AX-583025646

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "pcadapt", "4_way_venn_pcadap_outflank_158_SNPs.docx"))